

****PROGRAM BY: Modified from 00_PULL_EC_data.sas TERESA FORT********;
****WRITTEN ON 6-4-18************;


/*
NOTES:
This programs pulls in data from the economic census to make an establishment level dataset with information on 
sales, value added, employment, industry. For CMF and CWH it also adds inputs. 

This was adapted from Teresa 00_PULL_EC_data.sas code which was from the previous project. It is very similar to her SAS_01_Make_Census_data.sas code
but addes in 2012. I also don't merge in the information from the LBD at this stage since I will merge everything (BEA, census, imports) to the LBD 
     NOTE: SHE DOES SEEM TO PULL IN INFORMATION FROM THE LBD


**Last modified on: 04/10/19
**Last modified by: tf and jd


* modified Jack 02/16/2023 for replication
*/


******************************************************************************;

***The first macro brings in the sales data by industry from the Censuses;
***The second macro brings in employment data from the LBD;

***This program makes a dataset for each census year with sales, emp & industry, and with lbdnum and lbd emp;



options obs=max;
*options obs=100000;



*Raw census files ;
* yyyy is the location of the pdata command provided by CES ;
 %include 'yyyy/pdata.sas' ;



*xxx is the Project Root Folder ;
libname input 'xxx/input' ;
libname tempdata "xxxx/junk/";
libname out "xxxx/data/";

%let output=xxxx/data ;
%let ver= _c201600 ;

libname output "xxxx/data/";

/*  
 NOTE:
  symbolic links to lbd, fk_naics, and lfftd files point to latest versions
  current version is 201601 for lfttd and 201600 for lbd/fk naics
*/

****MACRO 1: BRING IN SALES DATA FROM THE CENSUSES;
************************************************************************************;

*****Make firm sales and employment;

**Get firm sales from the censuses;
***************************************************************;
**Define yearly lists;
%let ccn_1992=act ate cfn einum sic tr va cm;   /* same list for all early years */
%let ccn_1997=ate cfn einum naics tr va cm sic4; 
%let ccn_2002=actvstat alpha ein empavt naicold naicnew rcptot survu_id tabstat valaddc cstmprt;
%let ccn_2007=alpha ein empavg naics_old naics_new sales_t survu_id tabbed valaddc cp;
%let ccn_2012=alpha ein empavg naicnew naicold rcptot survu_id tabstat valaddc cstmprt; 
*ccn2012 is titled ccn.ccn2012_geo, 139k obs, roughly similar to the 145k obs in 2007;
*cp is cost of materials and parts, cstmprt is the same;
*cstcmt is selected costs, total, we do not use this;
*edited to include cstmpri for ccn2002;

%let cfi_1992=alpha cfn ein empl sales sic tabbed; *sic char6;  /* First year of cfi is 1992  */
%let cfi_1997=alpha cfn ein empl naics_new sales sic tabbed; *sic char6;
%let cfi_2002=active alpha ein empl naics_new naics_old sales survu_id tabbed;
%let cfi_2007=active alpha ein empl naics_old naics_new sales sales_t survu_id tabbed;
%let cfi_2012=active alpha ein empl naics_new naics_old sales sales_t survu_id tabbed;

%let cmf_1992=ar bridge cfn cm ei  te tvs va ; /* same list for all early years */
%let cmf_1997=ar bridge cfn ei exp ind nind te tvs va ; 
%let cmf_1997=bridge cfn ei  ind nind te tvs va fie fib tib tie mib mie wie wib ww oe ;
%let cmf_2002=ar active alpha ein ipt naics_new oe survu_id tabbed te tvs va tib tie fib fie mib mie wie wib ww ;
%let cmf_2007=ar alpha ein income_a ipt ivab ivae tib tie fib fie mib mie naics_old naics_new oe pay_ann_a sales_a
              sales_t survu_id tabbed te tvmc tvs va oe ow ww wie wib ;
%let cmf_2012=ar alpha ein income_a ipt ivab ivae tib tie fib fie mib mie naics_old naics_new oe pay_ann_a sales_a
              sales_t survu_id tabbed te tvmc tvs va oe ow ww wie wib ;


*Census of Mining: Note that there are also production and non-production workers here: ww;
%let cmi_1987=cfn ein cm nind oind te va vr vs ww;  /* first year of cmi  */
%let cmi_1992=cfn ein cm ind te tvps tvs va; *ind char4;
%let cmi_1997= cfn6 ein nind6 te tvps va cm oind; *oind char4;
%let cmi_2002=actvstat alpha ein empq1 naicold naicnew rcptot survu_id tabstat valaddm cstmtot;
%let cmi_2007=ein alpha naics_old naics_new tvps survu_id tabbed te tvs va cm ;
%let cmi_2012=alpha  ein empq1 empsmao  naicold naicnew rcptot survu_id tabstat valaddm cstmtot;

*using these for utilities, wholesale, services and retail;
%let rem_1977=alpha cfn ein ein_num_estabs empl firmid sales sic ;  /* works for 1977 and 1982  (no tabbed variable those years ); */
%let rem_1992=alpha cfn ein ein_num_estabs empl firmid sales sic tabbed;
%let rem_1997=alpha cfn ein empl naics_new sic sales tabbed; *sic char 6;
%let rem_2002=active alpha ein empl naics_new naics_old sales survu_id tabbed;
%let rem_2007=active alpha ein empl naics_old naics_new sales survu_id tabbed;
%let rem_2012=active alpha ein empl naics_old naics_new sales survu_id tabbed;


*Census of Aux; /* only needed for 92 and 97, after which exists in csr --not clear this is correct */
%let aux_1977= alpha cfn  ein femp  fsal fsls fsls_f  va sic4 ;
%let aux_1982 = alpha cfn ein femp femp_f  fsal fssp fsls fsls_f sic4  ; /*no va*/
%let aux_1987 = act alpha cfn ein femp87  fsal fsls87 fsls_f  sic sic87 splitter ;  /*  note there are also what i think are reported variables for all of these */
%let aux_1992 = act cfn ein fein f92sic femp92 fsls92 sic splitter srat;
%let aux_1997 = cfn ein naics sic c032 c010;
%let aux_2002= active alpha ein firmid naics_aux_new naics_new parent_id sales revenue survu_id tabbed ;
%let aux_2007= active alpha ein firmid naics_aux_new naics_new parent_id sales revenue: survu_id tabbed ;



**Lists for expenses and costs, etc.;
%let costs_cmf_2012=cw cp cm cost_goodsold_A ee cr cf exp ccmpq cexso cdapr sw  cmea cmec cbe cme cmeo ctemp ecom_sales fipsst cou invfor nummats numprods  tce tpw ipt vr  ;
%let costs_cwh_2012=gross_margin gross_profit merch merch_r opexp opexp_r weight state_fips sales_t optype;

%let costs_cmf_2007=cw cp cm cost_goodsold_A ee cr cf exp ccmpq cexso cdapr sw  cmea cmec cbe cme cmeo ctemp ecom_sales fipsst cou invfor nummats numprods  tce tpw ipt vr  ;
%let costs_cwh_2007=gross_margin gross_profit merch merch_r opexp opexp_r weight state_fips sales_t optype;

%let costs_cmf_2002=cw cp cm cost_goodsold_A ee cr cf exp sw  cmea cmec cbe cme cmeo  ecom_sales fipsst cou nummats numprods tce tpw vr;
%let costs_cwh_2002=gross_margin gross_profit merch merch_r opexp opexp_r weight state_fips sales_t optype;

%let costs_cmf_1997=cw cp cm ee cr cf cet crm cpc cs exp fcm1 fcm2  sw  fipsst cou ne nummats numprods rm seqnmats tce tdcp tme tpw tr tvmc tvpsd ub ue um ipt vr  ;
%let costs_cwh_1997=gross_margin gross_profit merch merch_r opexp opexp_r weight state_fips  optype;




%macro firm_sales(y);
*Pull in pre-2002 census data;
%if &y<2001 %then %do;

   
%if &y=1977 %then %do;	
data census&y;
  set pdata.ccn&y     (keep=&ccn_1992     in=a)
      pdata.cmf&y     (keep=&cmf_1992   in=c rename=(ei=ein)) 
      pdata.crt&y.base(keep=&rem_1977    in=e)
      pdata.csr&y.base(keep=&rem_1977    in=f rename=(ein=ein_t))
      pdata.cwh&y.base(keep=&rem_1977     in=h)
      pdata.aux&y	    (keep=&&aux_&y     in=x );
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
	else if x=1 then flag_x=1 ;
      
       *Make year-specific variables;
	if x=1 then sales=fsls;
	if x=1 then emp=femp;
	
    if tabbed~='N';
run;
%end;

%if &y=1982 %then %do;	
data census&y;
  set pdata.ccn&y     (keep=&ccn_1992     in=a)
      pdata.cmf&y     (keep=&cmf_1992    in=c rename=(ei=ein)) 
      pdata.crt&y.base(keep=&rem_1977    in=e)
      pdata.csr&y.base(keep=&rem_1977    in=f rename=(ein=ein_t))
      pdata.cwh&y.base(keep=&rem_1977     in=h)
      pdata.aux&y	    (keep=&&aux_&y     in=x );
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
	else if x=1 then flag_x=1 ;
   *Make year-specific variables;
   if x=1 then sales=fsls;
   if x=1 then emp=femp;

    if tabbed~='N';
run;
%end;


%if &y=1987 %then %do;	
data census&y;
  set pdata.ccn&y     (keep=&ccn_1992     in=a)
      pdata.cmf&y     (keep=&cmf_1992    in=c rename=(ei=ein)) 
      pdata.cmi&y     (keep=&&cmi_&y    in=d )
      pdata.crt&y.base(keep=&rem_1992     in=e)
      pdata.csr&y.base(keep=&rem_1992     in=f rename=(ein=ein_t))
      pdata.cut&y.base(keep=&rem_1992     in=g) 
      pdata.cwh&y.base(keep=&rem_1992     in=h)
      pdata.aux&y	    (keep=&&aux_&y     in=x );
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
	else if x=1 then flag_x=1 ; 
   *Make year-specific variables;
	if x=1 then sales=fsls87;
	if d=1 then sales=vs;
	
    if tabbed~='N';
run;
%end;

 
%if &y=1992 %then %do;	
data census&y;
  set pdata.ccn&y     (keep=&&ccn_&y     in=a)
      pdata.cfi&y.base(keep=&&cfi_&y     in=b)
      pdata.cmf&y     (keep=&&cmf_&y     in=c rename=(ei=ein)) /* what happens to the ind in cmf? */
      pdata.cmi&y     (keep=&&cmi_&y     in=d)
      pdata.crt&y.base(keep=&&rem_&y     in=e)
      pdata.csr&y.base(keep=&&rem_&y     in=f rename=(ein=ein_t ))
      pdata.cut&y.base(keep=&&rem_&y     in=g ) 
      pdata.cwh&y.base(keep=&&rem_&y     in=h)
      pdata.aux&y	    (keep=&&aux_&y     in=x rename=(fsls92=sales femp92=emp));
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
	else if x=1 then flag_x=1 ;

  *Make year-specific variables;
	if flag_d=1 then sales=tvs;   /*  but note that next year it chaanges to product shipments  */
    if tabbed~='N';
run;
%end;

%if &y=1997 %then %do;
data census&y ;
  set pdata.ccn&y     (keep=&&ccn_&y     in=a )
      pdata.cfi&y.base(keep=&&cfi_&y     in=b)
      pdata.cmf&y     (keep=&&cmf_&y &&costs_cmf_&y  in=c rename=(ei=ein))
      pdata.cmi&y     (keep=&&cmi_&y     in=d) 
      pdata.crt&y.base(keep=&&rem_&y     in=e)
      pdata.csr&y.base(keep=&&rem_&y     in=f rename=(ein=ein_t))
      pdata.cut&y.base(keep=&&rem_&y     in=g ) 
      pdata.cwh&y.base(keep=&&rem_&y  &&costs_cwh_&y  in=h) ;
    *  input.aux&y	    (keep=&&aux_&y     in=x);

    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
 	* else if x=1 then flag_x=1 ;
  
  *Make census-specific variables;
	* if x=1 then sales=c010;
        * if x=1 then emp=c032;
	if flag_d=1 then sales=tvps;
        if flag_c=1 then sales=tvs;
        if flag_c=1 then sales_manuf=tvs ;

  if tabbed~='N';
run;
%end;
	
 *Fix all variables for pre-2002 EC data;
 data census&y (drop=temp: einum ein_t tr  sic ate empl sic6 sic nind ind oind naics naics_new nind6 cfn6);
   length firmid $ 10 temp $ 8 temp2 $ 6;
   set census&y;	    
   
  *Make correct ein variable;
     if flag_f=1 then ein=left(trim(ein_t));
     if flag_a=1 then ein=einum;

   *Make firmid variable and alpha;
   if substr(cfn,1,1)='0' then firmid=cfn;
   if substr(cfn,1,1)~='0' then firmid=substr(cfn,1,6)||'0000';
   *fix alpha for ccn, cmf, cmi, aux ;
    if flag_a=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);
    if flag_c=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);
    if flag_d=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);
    * if flag_x=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6) ; 

  *Makes sales variable;
    if flag_a=1 then sales=tr;
    if flag_c=1 then sales=tvs;

  *Make census employment variable;
   if flag_a=1 then emp_cen=ate;
   if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then emp_cen=empl;
   if flag_c=1 | flag_d=1 then emp_cen=te;
  *  if flag_x=1 then emp_cen = emp;

  *Make naics code ;
    naics97_census=naics_new ;
    if flag_a=1 then naics97_new=naics ;
    if flag_c=1 then naics97_new=nind ;
    if flag_d=1 then naics97_new=nind6 ;

  *Make source variable;
       if flag_a=1        then source='ccn' ;
         else if flag_b=1 then source='cfi' ;
         else if flag_c=1 then source='cmf' ;
         else if flag_d=1 then source='cmi' ;
         else if flag_e=1 then source='crt' ;
         else if flag_f=1 then source='csr' ;
         else if flag_g=1 then source='cut' ;
         else if flag_h=1 then source='cwh' ;
      *   else if flag_x=1 then source='aux' ;
run;
%end;


***Pull in 2002 Census data;
%if &y=2002 %then %do;
data census&y(drop=temp: surv: ein_num ein_l empavt empq1 empl rcptot sales_t naics_old naics_new naicnew naicold);
    length firmid $ 10 temp2 $ 10 temp3 $ 11;
  set pdata.ccn&y     (keep=&&ccn_&y     in=a rename=(ein=ein_num survu_id=surv_num))
      pdata.cfi&y.base(keep=&&cfi_&y     in=b)
      pdata.cmf&y     (keep=&&cmf_&y  &&costs_cmf_&y     in=c rename=(ein=ein_num))
      pdata.cmi&y     (keep=&&cmi_&y     in=d rename=(ein=ein_num survu_id=surv_num))
      pdata.crt&y.base(keep=&&rem_&y     in=e)
      pdata.csr&y.base(keep=&&rem_&y     in=f rename=(ein=ein_l))
      pdata.cut&y.base(keep=&&rem_&y     in=g)
      pdata.cwh&y.base(keep=&&rem_&y   &&costs_cwh_&y   in=h) ;
    *  input.aux&y     (keep=&&aux_&y     in=x)  ;
   *Make correct ein variable;
     if ein_num~=. then do;
	 ein=put(ein_num,z9.);
     end;
    if f=1 then ein=left(trim(ein_l)); /*now all variables back to ein*/
   *Make cfn variable;
    if a=1 | d=1 then do;
      temp2=surv_num;
      cfn=temp2;
      end;
    if b=1 | c=1 | e=1 | f=1 | g=1 | h=1 then cfn=survu_id;  
  *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;
   *Make activity code flag;
    if a=1 | d=1 then active=actvstat;
   *Make tabbed flag;
    if a=1 | d=1 then tabbed=tabstat;

  *Make employment variable;
   if a=1 then emp_cen=empavt;
   if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
   if c=1 then emp_cen=te;
   if d=1 then emp_cen=empq1;

  *Make naics variable;
        naics02_census=naics_new;
        if d=1 then naics02_census=naicnew;
        
   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;
       *  else if x=1 then source='aux' ;
         
   *Make sales variable;       
   if a=1 | d=1 then sales=rcptot;
   if c=1 then sales=tvs;
   

if tabbed~='N' ;  
*if tabbed~='N' & active~='N';
run;
%end;

***Pull in 2007 Census data;
%if &y=2007 %then %do;
data census&y(drop=ein_t temp: surv: ein_l naics_old sales_t sales_a naics_new empl empavg);
    length firmid $ 10 temp2 $10 ;
    set pdata.ccn&y       (keep=&&ccn_&y     in=a rename=(survu_id=temp1 ein=ein_t) )
      pdata.cfi&y.base    (keep=&&cfi_&y     in=b)
      pdata.cmf&y         (keep=&&cmf_&y  &&costs_cmf_&y   in=c rename=(ein=ein_t))
      pdata.cmi&y         (keep=&&cmi_&y     in=d rename=(survu_id=temp1 ein=ein_t))
      pdata.crt&y.base    (keep=&&rem_&y     in=e)
      pdata.csr&y.base    (keep=&&rem_&y     in=f rename=(ein=ein_l))
      pdata.cut&y.base    (keep=&&rem_&y     in=g) 
      pdata.cwh&y.base    (keep=&&rem_&y  &&costs_cwh_&y   in=h) ;
      *input.aux&y         (keep=&&aux_&y     in=x)  ;
      *cant access aux dataset on this project;
   
 *Make fixed ein variable;
    if ein_t~=. then do;
	 ein=put(ein_t,z9.);
     end;
     if f=1 then ein=left(trim(ein_l));
			  
   *Make cfn variable;
   if a=1 | d=1 then do;
       temp2=temp1;
       cfn=temp2;
       end;
    if b=1 | c=1 | e=1 | f=1 | g=1 | h=1 then cfn=survu_id; 
   *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;
   *Make sales variable;
    if a=1 then sales=sales_t;
    if c=1 | d=1 then sales=tvs;

   *Make naics variable;
        naics07_census=naics_new;
        naics02_census=naics_old ;

   *Make employment variable;
    if a=1 then emp_cen=empavg;
    if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
    if c=1 | d=1 then emp_cen=te;
 
   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;
       *  else if x=1 then source='aux' ;
if tabbed~='N' ;
*if tabbed~='N' & active~='N';
run;
%end;

***Pull in 2012 Census data (Preliminary geo data);
%if &y=2012 %then %do;
data census&y(drop=temp: surv: ein_t ein_l tabstat rcptot naicnew naicold naics_new naics_old empavg empl empq1);
    length firmid $ 10 temp2 $10;
    set pdata.ccn&y._final (keep=&&ccn_&y     in=a rename=(survu_id=temp1 ein=ein_t) )
      pdata.cfi&y.base   (keep=&&cfi_&y     in=b)
      pdata.cmf&y        (keep=&&cmf_&y &&costs_cmf_&y  in=c rename=(ein=ein_t))
      pdata.cmi&y._final (keep=&&cmi_&y     in=d rename=(survu_id=temp1 ein=ein_t))
      pdata.crt&y.base   (keep=&&rem_&y     in=e)
      pdata.csr&y.base   (keep=&&rem_&y     in=f rename=(ein=ein_l))
      pdata.cut&y.base   (keep=&&rem_&y     in=g) 
      pdata.cwh&y.base   (keep=&&rem_&y &&costs_cwh_&y    in=h);
   
 *Make fixed ein variable;
    if ein_t~=. then do;
	 ein=put(ein_t,z9.);
     end;
     if f=1 then ein=left(trim(ein_l));
 
   *Make cfn variable;
   if a=1 | d=1 then do;
       temp2=temp1;
       cfn=temp2;
       end;
    if b=1 | c=1 | e=1 | f=1 | g=1 | h=1 then cfn=survu_id; 

   *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;

   *Make tabbed flag;
    if a=1 | d=1 then tabbed=tabstat;

   *Make sales variable;
    if a=1 | d=1 then sales=rcptot;
    if c=1 then sales=tvs;

   *Make naics variable;
        naics12_census=naics_new;

   *Make employment variable;
    if a=1 then emp_cen=empavg;
    if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
   if c=1 then emp_cen=te;
   if d=1 then emp_cen=empsmao;

   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;
       *  else if x=1 then source='aux' ;
if tabbed~='N' ;
*if tabbed~='N' & active~='N';
run;
%end;



    *below is Teresas duplicate code; 
    proc sort data=census&y;
	by cfn ;
    run;
    data census&y dupes&y;
	set census&y;
	by cfn;
	if first.cfn=1 & last.cfn=1 then output census&y;
	else output dupes&y;
    run;
    data dupes_add;
	set dupes&y;
	if source='cmf';
    run;
    proc sort nodupkey data=dupes_add;
	by cfn ;
    run;
    data census&y;
	set census&y dupes_add;
    run;


/*
	 *export to stata;
           proc export data=census&y
            outfile="&output/estabs_sales&y..dta"
            dbms=dta replace;
            run;
*/
%mend;

/*
%firm_sales(1977);
%firm_sales(1982);
%firm_sales(1987);
%firm_sales(1992);
*/
*%firm_sales(1997);
*%firm_sales(2002);
%firm_sales(2007);
*%firm_sales(2012);


***MACRO 2: GET EMPLOYMENT and PAYROLL DATA FROM THE LBD (Expands the scope of data coverage);
************************************************************************************;

**Make firm employment share data;
%macro firm_emp(y,ind);
    proc sort data=pdata.lbd&y.c&ver.(keep=lbdnum cfn firmid emp naics sic bestnaics bestsic pay state county zip flagb cbp
                                rename=(sic=sic1 firmid=firmid_lbd naics=naics_lbd emp=emp_lbd pay=pay_lbd state=state_lbd 
county=county_lbd zip=zip9_lbd)) out=lbd;
	by lbdnum  ;
    run;

    data lbd;
	set lbd;
	cfn_lbd=cfn;
        if cbp=1 & flagb~="Death" ;
    run;    

  *Add the fk naics codes;
    data fknaics(keep=lbdnum fk_naics:) ;
	set pdata.naics&y.&ver.;
        %if &y=2012 %then %do;
            fk_naics=fk_naics07;
            random_tot=0 ;
            %end ;
	%if &y<2012 %then %do;
          fk_naics=fk_naics02;  
          random_tot=sum(fk_n97_splits,fk_sic97_splits,fk_sic87_splits);
	  %end;
	   if random_tot=. | random_tot<500;
          label  fk_naics="fk_naics02 if year<2012, fk_naics07 if year>=2012";   
    run;
 
    proc sort data=fknaics;
	by lbdnum;
    run;
    
    data lbd;
	merge lbd(in=a) fknaics;
	by lbdnum;
	if a=1;
    run;

    *merge census and lbd on cfn; 
    proc sort data=lbd;
	by cfn;
    run;
    
    proc sort data=census&y;
	by cfn;
    run;
    
    data estabs&y(drop=sic1) no_lbd_match&y(drop=sic1) all_estabs&y(drop=sic1) ;
	length sic_lbd $ 4 sic $ 4 naics97 $ 6 naics02 $ 6 naics07 $ 6;
	merge census&y(in=a rename=(emp_cen=emp_census firmid=firmid_census)) lbd(in=b);
	by cfn;
       
        *Make industry variables;
	sic_lbd=substr(sic1,1,4);
	sic=sic_census;
	if sic=' ' & sic_lbd~=' ' then sic=sic_lbd;
	if sic=' ' & bestsic~=' ' then sic=substr(bestsic,1,4); 
	naics97=naics97_census;
	if naics97=' ' & naics_lbd~=' ' & &y=1997 then naics97=substr(naics_lbd,1,6);
	naics02=naics02_census;
	if naics02=' ' & naics_lbd~=' ' & &y=2002 then naics02=substr(naics_lbd,1,6);
        naics07=naics07_census;
	if naics07=' ' & naics_lbd~=' ' & &y=2007 then naics07=substr(naics_lbd,1,6);
	naics12=naics12_census;
	if naics12=' ' & naics_lbd~=' ' & &y=2012 then naics12=substr(naics_lbd,1,6);
        
        *Make firmid variable;
	firmid=firmid_census;
	if firmid=' ' then firmid=firmid_lbd;
	%if &y=1997 %then %do;
	    if source='cmi' | source='ccn' then firmid=firmid_lbd;  /*  No alpha found in this census data  */
	    %end;
	
	if left(trim(firmid))='0' then firmid=firmid_lbd;
	if a=0 & b=1 then source="LBD";   
        
        *if Census emp missing, use LBD emp;
	emp=emp_census;
        if emp=. | emp=0 then emp=emp_lbd;
        if a=1 & b=0 then LBD_miss='yes'; 
        if a=0 & b=1 then CEN_miss='yes';   
   
       *Label variables;
       label   LBD_miss="Record missing from LBD"
	       CEN_miss="Record missing from Census"
	       emp_lbd="Emp from LBD"
               emp_census="Emp from Census" 
               EIN="EIN from Census"
               ALPHA="Alpha from Census"
               naics02="1. Naics02 from Census; 2. Naics from LBD";
       
      
       *Output datasets;
	if a=1 then output estabs&y;
	if a=1 | (b=1 & emp_lbd~=0 & emp_lbd~=. ) then output all_estabs&y;
	if a=1 & b=0 then output no_lbd_match&y;
    run;

 
   *export data to stata;
    proc export data=all_estabs&y
    outfile="&output/estabs_sales&y..dta"
    dbms=dta replace;
    run;

   * also need to export to sas?;
    data output.estabs_sales&y;
    set all_estabs&y;
    run;

%mend;
*%firm_emp(1997);
*%firm_emp(2002);
%firm_emp(2007);
*%firm_emp(2012);



***MACRO 3: GET Firm names from the ssel for 2007 ;
************************************************************************************;

%macro firmnames(y);
 

    %let listbr_s= alpha act empunit_id_char ein acqp1 acqp2 acqp3 acqp4 pdiv ac943p name1 name2 street_phy street_mail st_phy st_mail;  
    %let listbr_m= alpha act empunit_id_char ein rap name1 name2 street_phy street_mail st_phy st_mail;  


  data ids(keep=cfn payroll ssel_name1 ssel_name2 ein) ;
	  set 	pdata.ssl&y.mu(keep=&listbr_m rename=(name1=ssel_name1 name2=ssel_name2 street_phy=ssel_street_phy street_mail=ssel_street_mail
                st_phy=ssel_st_phy st_mail=ssel_st_mail) in=a) 
		  pdata.ssl&y.su(keep=&listbr_s rename=(name1=ssel_name1 name2=ssel_name2  street_phy=ssel_street_phy street_mail=ssel_street_mail
                  st_phy=ssel_st_phy st_mail=ssel_st_mail) in=b);
	  length firmid $10 ;

	  *Make CFN variable if on or post 2002;
	  %if &y>=2002 %then %do;
		  cfn = empunit_id_char;
	  %end;

	  *Make firmid variable depending on pre or post 2002;
	  %if &y<2002 %then %do;
		  if substr(cfn,1,1)='0' then firmid=cfn;
		  if substr(cfn,1,1)~='0' then firmid=substr(cfn,1,6)||'0000';
	  %end;
	  %if &y>=2002 %then %do;
		  if alpha=' ' then firmid='0'||ein;
		  if alpha~=' ' then firmid=left(trim(alpha))||"0000";
	  %end;

	  *Make payroll variable;
	  payroll=rap;
	  if b=1 then payroll=acqp1+acqp2+acqp3+acqp4 ;
	  
	  *Only records with a firmid and ein are useful;
	  if firmid^=' ';
	  *if cfn^=' ';
	  if ein^=' ';  
	  if ein^='000000000';
	  if left(trim(ein))~='.';
	  if left(trim(firmid))^='.';
	  
	  *Code to identify active establishments; 
	  if act='D' | act='G' | act='N' then drop=1;  *Drop records that are deletes, ghosts or non-actives;
	  if pdiv='M' then do;
		  drop=1;
		  add_subs=1;  *Submaster records;
	  end;
	  
	  *Additional records to add back in if not conflicting;
	  if a=1 & payroll>0 & act='D' then add_recs=1;
	  if a=1 & payroll>0 &  act='N' then add_recs=1;
          
  *Might be possible for an MU to have act=N and positive payroll, in which case it was a business that closed during year;
	  
	  *Output datasets;
	  if drop~=1;

  run;


  proc sort data=ids;
    by cfn payroll;
  run;

  *duplicate cfns, keep the ones with max payroll;
    data firmnames(keep=cfn ssel: ein);
	set ids;
	by cfn;
	if first.cfn=1;
    run;


  *merge firmnames from ssel to census and lbd data; 
  proc sql; 
  create table tempdata.estabs_sales&y._names as 
  select A.*, ssel.ssel_name1, ssel.ssel_name2, ssel.ein as ssel_ein 
  from all_estabs&y as A 
   left join firmnames as ssel on A.cfn eq ssel.cfn;


    *export data to stata;
    proc export data=tempdata.estabs_sales&y._names
    outfile="&output/estabs_sales&y._names.dta"
    dbms=dta replace;
    run;

%mend;


%firmnames(2007);
* %firmnames(2012);









